import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
# Global plot styling: white-grid seaborn theme, larger font, wide default figure.
sns.set_style('whitegrid')
plt.rcParams.update({
    'font.size': 12,
    'figure.figsize': (15, 5),
})

# Load the World Happiness Report 2021 data set and preview the first rows.
# NOTE(review): absolute, machine-specific path - this only runs where the
# file exists at exactly this location.
csv_path = r'C:\Users\TajwarAbtahee\OneDrive - JCW Resourcing\Desktop\Python\practice\world-happiness-report-2021.csv'
df = pd.read_csv(csv_path)
df.head()
| Country name | Regional indicator | Ladder score | Standard error of ladder score | upperwhisker | lowerwhisker | Logged GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | Ladder score in Dystopia | Explained by: Log GDP per capita | Explained by: Social support | Explained by: Healthy life expectancy | Explained by: Freedom to make life choices | Explained by: Generosity | Explained by: Perceptions of corruption | Dystopia + residual | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Finland | Western Europe | 7.842 | 0.032 | 7.904 | 7.780 | 10.775 | 0.954 | 72.0 | 0.949 | -0.098 | 0.186 | 2.43 | 1.446 | 1.106 | 0.741 | 0.691 | 0.124 | 0.481 | 3.253 |
| 1 | Denmark | Western Europe | 7.620 | 0.035 | 7.687 | 7.552 | 10.933 | 0.954 | 72.7 | 0.946 | 0.030 | 0.179 | 2.43 | 1.502 | 1.108 | 0.763 | 0.686 | 0.208 | 0.485 | 2.868 |
| 2 | Switzerland | Western Europe | 7.571 | 0.036 | 7.643 | 7.500 | 11.117 | 0.942 | 74.4 | 0.919 | 0.025 | 0.292 | 2.43 | 1.566 | 1.079 | 0.816 | 0.653 | 0.204 | 0.413 | 2.839 |
| 3 | Iceland | Western Europe | 7.554 | 0.059 | 7.670 | 7.438 | 10.878 | 0.983 | 73.0 | 0.955 | 0.160 | 0.673 | 2.43 | 1.482 | 1.172 | 0.772 | 0.698 | 0.293 | 0.170 | 2.967 |
| 4 | Netherlands | Western Europe | 7.464 | 0.027 | 7.518 | 7.410 | 10.932 | 0.942 | 72.4 | 0.913 | 0.175 | 0.338 | 2.43 | 1.501 | 1.079 | 0.753 | 0.647 | 0.302 | 0.384 | 2.798 |
# Count the missing values in every column.
df.isna().sum()
Country name 0 Regional indicator 0 Ladder score 0 Standard error of ladder score 0 upperwhisker 0 lowerwhisker 0 Logged GDP per capita 0 Social support 0 Healthy life expectancy 0 Freedom to make life choices 0 Generosity 0 Perceptions of corruption 0 Ladder score in Dystopia 0 Explained by: Log GDP per capita 0 Explained by: Social support 0 Explained by: Healthy life expectancy 0 Explained by: Freedom to make life choices 0 Explained by: Generosity 0 Explained by: Perceptions of corruption 0 Dystopia + residual 0 dtype: int64
# Keep the identifying columns, the ladder score and the six raw factor
# columns; every other column of the loaded frame is dropped.
data_cols = [
    'Country name',
    'Regional indicator',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
df = df.loc[:, data_cols].copy()
# Top 20 countries by happiness (ladder score), highest first
ranked = df.sort_values(by='Ladder score', ascending=False)
top = ranked.head(20)
sns.barplot(x='Country name', y='Ladder score', data=top)
plt.xticks(rotation=30)
# Truncated y-axis so the small differences between the leaders are visible.
plt.ylim(6.75, 8)
plt.show()
print(top['Regional indicator'].value_counts())
# 5 Nordic countries are in the top 7
# The majority of the top-20 countries are from Western Europe
Western Europe 13 North America and ANZ 4 Middle East and North Africa 1 Central and Eastern Europe 1 Latin America and Caribbean 1 Name: Regional indicator, dtype: int64
# GDP vs happiness
px.scatter(df, x='Logged GDP per capita', y='Ladder score', trendline='ols')
# As we can see, both variables are strongly positively correlated.
# Let's explore the other correlations in the data.
# The frame still holds the text columns 'Country name' and
# 'Regional indicator'. DataFrame.corr() raises on those with pandas >= 2.0
# (older versions dropped them silently), so select the numeric columns first.
df.select_dtypes(include='number').corr().style.background_gradient(cmap='RdYlGn')
# Ladder score (happiness) has the strongest relation with GDP per capita,
# followed by healthy life expectancy and social support.
# Generosity is only weakly correlated with every other variable.
| Ladder score | Logged GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
|---|---|---|---|---|---|---|---|
| Ladder score | 1.000000 | 0.789760 | 0.756888 | 0.768099 | 0.607753 | -0.017799 | -0.421140 |
| Logged GDP per capita | 0.789760 | 1.000000 | 0.785299 | 0.859461 | 0.432323 | -0.199286 | -0.342337 |
| Social support | 0.756888 | 0.785299 | 1.000000 | 0.723256 | 0.482930 | -0.114946 | -0.203207 |
| Healthy life expectancy | 0.768099 | 0.859461 | 0.723256 | 1.000000 | 0.461494 | -0.161750 | -0.364374 |
| Freedom to make life choices | 0.607753 | 0.432323 | 0.482930 | 0.461494 | 1.000000 | 0.169437 | -0.401363 |
| Generosity | -0.017799 | -0.199286 | -0.114946 | -0.161750 | 0.169437 | 1.000000 | -0.163962 |
| Perceptions of corruption | -0.421140 | -0.342337 | -0.203207 | -0.364374 | -0.401363 | -0.163962 | 1.000000 |
# GDP vs happiness again, this time coloured by region (static version).
sns.scatterplot(
    x='Logged GDP per capita',
    y='Ladder score',
    hue='Regional indicator',
    s=200,
    data=df,
)
# Anchor the legend's upper-left corner just right of the axes so it sits
# outside the plotting area.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.0)
plt.show()
# Interactive version with the country name attached to each point on hover.
px.scatter(
    df,
    x='Logged GDP per capita',
    y='Ladder score',
    color='Regional indicator',
    hover_name='Country name',
)
# As we can see, African countries are on the lower end whereas Western European countries are on the upper end.
# Latin American and Caribbean countries have above-average happiness.
# Afghanistan seems very far off from its regional cluster.
# Regional GDP contribution
# NOTE(review): this adds up *logged per-capita* values, so a region's total
# grows with the number of countries it contains - confirm this is the
# intended measure.
region_totals = df.groupby('Regional indicator')['Logged GDP per capita'].sum()
gdp = region_totals.reset_index()
gdp
| Regional indicator | Logged GDP per capita | |
|---|---|---|
| 0 | Central and Eastern Europe | 171.854 |
| 1 | Commonwealth of Independent States | 112.822 |
| 2 | East Asia | 62.206 |
| 3 | Latin America and Caribbean | 187.400 |
| 4 | Middle East and North Africa | 164.324 |
| 5 | North America and ANZ | 43.238 |
| 6 | South Asia | 60.778 |
| 7 | Southeast Asia | 84.793 |
| 8 | Sub-Saharan Africa | 290.707 |
| 9 | Western Europe | 227.277 |
px.pie(data_frame=gdp, values='Logged GDP per capita', names='Regional indicator')
# Sub-Saharan Africa and Western Europe take the largest shares.
# Total countries from each region
region_column = df['Regional indicator']
region_column.value_counts()
# This relates directly to the previous chart: Sub-Saharan African countries are the most prevalent in this dataset.
Sub-Saharan Africa 36 Western Europe 21 Latin America and Caribbean 20 Middle East and North Africa 17 Central and Eastern Europe 17 Commonwealth of Independent States 12 Southeast Asia 9 South Asia 7 East Asia 6 North America and ANZ 4 Name: Regional indicator, dtype: int64
# Mean perception of corruption per region, rounded to 2 decimals, highest first
region_means = df.groupby('Regional indicator')['Perceptions of corruption'].mean()
corrupt = (
    region_means.round(2)
    .reset_index()
    .sort_values(by='Perceptions of corruption', ascending=False)
)
corrupt
| Regional indicator | Perceptions of corruption | |
|---|---|---|
| 0 | Central and Eastern Europe | 0.85 |
| 6 | South Asia | 0.80 |
| 3 | Latin America and Caribbean | 0.79 |
| 8 | Sub-Saharan Africa | 0.77 |
| 4 | Middle East and North Africa | 0.76 |
| 1 | Commonwealth of Independent States | 0.73 |
| 7 | Southeast Asia | 0.71 |
| 2 | East Asia | 0.68 |
| 9 | Western Europe | 0.52 |
| 5 | North America and ANZ | 0.45 |
# Bar chart of the regional corruption means, each bar annotated with its value.
ax = sns.barplot(x='Regional indicator', y='Perceptions of corruption', data=corrupt)
bars = ax.containers[0]
ax.bar_label(bars, labels=corrupt['Perceptions of corruption'])
plt.xticks(rotation=90)
plt.show()
# Preview the first ten rows ordered by healthy life expectancy.
# Built directly from df: the name `top10` is only assigned further down, so
# referencing it here would raise NameError on a top-to-bottom run.
df.head(10).sort_values('Healthy life expectancy', ascending=False)
| Country name | Regional indicator | Ladder score | Logged GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
|---|---|---|---|---|---|---|---|---|---|
| 2 | Switzerland | Western Europe | 7.571 | 11.117 | 0.942 | 74.4 | 0.919 | 0.025 | 0.292 |
| 8 | New Zealand | North America and ANZ | 7.277 | 10.643 | 0.948 | 73.4 | 0.929 | 0.134 | 0.242 |
| 5 | Norway | Western Europe | 7.392 | 11.053 | 0.954 | 73.3 | 0.960 | 0.093 | 0.270 |
| 9 | Austria | Western Europe | 7.268 | 10.906 | 0.934 | 73.3 | 0.908 | 0.042 | 0.481 |
| 3 | Iceland | Western Europe | 7.554 | 10.878 | 0.983 | 73.0 | 0.955 | 0.160 | 0.673 |
| 1 | Denmark | Western Europe | 7.620 | 10.933 | 0.954 | 72.7 | 0.946 | 0.030 | 0.179 |
| 6 | Sweden | Western Europe | 7.363 | 10.867 | 0.934 | 72.7 | 0.945 | 0.086 | 0.237 |
| 7 | Luxembourg | Western Europe | 7.324 | 11.647 | 0.908 | 72.6 | 0.907 | -0.034 | 0.386 |
| 4 | Netherlands | Western Europe | 7.464 | 10.932 | 0.942 | 72.4 | 0.913 | 0.175 | 0.338 |
| 0 | Finland | Western Europe | 7.842 | 10.775 | 0.954 | 72.0 | 0.949 | -0.098 | 0.186 |
# Life expectancy of the 10 happiest and the 10 least happy countries
def _life_expectancy_panel(data, ax, title, ylim):
    """Draw one annotated bar chart of healthy life expectancy per country.

    data  -- rows to plot, already in display order
    ax    -- matplotlib Axes to draw on
    title -- title shown above the panel
    ylim  -- (low, high) limits for the y-axis
    Returns the Axes object handed back by seaborn.
    """
    panel = sns.barplot(data=data, x='Country name', y='Healthy life expectancy', ax=ax)
    panel.bar_label(container=panel.containers[0], labels=data['Healthy life expectancy'])
    ax.set_xticklabels(data['Country name'], rotation=30)
    ax.set_ylim(*ylim)
    ax.set_title(title)
    return panel


# Rank by ladder score explicitly instead of trusting the row order of the
# loaded file, then order each group by life expectancy for display.
by_happiness = df.sort_values('Ladder score', ascending=False)
top10 = by_happiness.head(10).sort_values('Healthy life expectancy', ascending=False)
bot10 = by_happiness.tail(10).sort_values('Healthy life expectancy', ascending=False)

fig, axes = plt.subplots(1, 2)
# The y-limits are zoomed windows, so the two panels are not on a shared scale.
t = _life_expectancy_panel(top10, axes[0], 'Top 10 Happy Countries', (71, 75))
b = _life_expectancy_panel(bot10, axes[1], 'Bottom 10 Happy Countries', (45, 65))
plt.show()
# Top 10 countries all have life expectancies above 70, with a smaller range showing more consistency.
# Bottom 10 countries have most life expectancies between 50 and 60.
# Pairwise relationships between all numeric columns, coloured by region.
sns.pairplot(data=df, hue='Regional indicator')
# In the majority of relationships Western Europe tends to have the best position;
# Sub-Saharan African countries show the inverse.
<seaborn.axisgrid.PairGrid at 0x1ea28bdf640>
# Quick reminder of what the trimmed frame looks like (first five rows).
df.head(n=5)
| Country name | Regional indicator | Ladder score | Logged GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Finland | Western Europe | 7.842 | 10.775 | 0.954 | 72.0 | 0.949 | -0.098 | 0.186 |
| 1 | Denmark | Western Europe | 7.620 | 10.933 | 0.954 | 72.7 | 0.946 | 0.030 | 0.179 |
| 2 | Switzerland | Western Europe | 7.571 | 11.117 | 0.942 | 74.4 | 0.919 | 0.025 | 0.292 |
| 3 | Iceland | Western Europe | 7.554 | 10.878 | 0.983 | 73.0 | 0.955 | 0.160 | 0.673 |
| 4 | Netherlands | Western Europe | 7.464 | 10.932 | 0.942 | 72.4 | 0.913 | 0.175 | 0.338 |
# Let's compare countries with respect to perception of corruption:
# the 10 lowest and the 10 highest scores side by side.
corrtop = df.sort_values('Perceptions of corruption').head(10)
corrbot = df.sort_values('Perceptions of corruption', ascending=False).head(10)

fig, axes = plt.subplots(1, 2)

# Left panel: lowest perceived corruption.
a = sns.barplot(data=corrtop, x='Country name', y='Perceptions of corruption', ax=axes[0])
a.bar_label(container=a.containers[0], labels=corrtop['Perceptions of corruption'])
axes[0].set_xticklabels(corrtop['Country name'], rotation=90)
axes[0].set_title('Top 10 Least Corrupt')

# Right panel: highest perceived corruption.
b = sns.barplot(data=corrbot, x='Country name', y='Perceptions of corruption', ax=axes[1])
b.bar_label(container=b.containers[0], labels=corrbot['Perceptions of corruption'])
axes[1].set_xticklabels(corrbot['Country name'], rotation=90)
axes[1].set_title('Top 10 Most Corrupt')
# Zoom the right panel only. Set the limits on the Axes itself instead of
# plt.ylim(), which acts on whichever axes happens to be current.
axes[1].set_ylim(0.9, 0.95)
plt.show()

print(corrtop['Regional indicator'].value_counts())
print(corrbot['Regional indicator'].value_counts())
# Singapore has the lowest perceived corruption by a large margin; however, most of the least-corrupt countries are from Western Europe.
# Central and Eastern Europe has the highest number of countries in the most-corrupt list, and all ten are above 0.9 with a small range.
# Referring back to our pairplot: the higher the perceived corruption, the lower the country's happiness.
Western Europe 7 Sub-Saharan Africa 1 Southeast Asia 1 North America and ANZ 1 Name: Regional indicator, dtype: int64 Central and Eastern Europe 6 Commonwealth of Independent States 2 South Asia 1 Sub-Saharan Africa 1 Name: Regional indicator, dtype: int64